{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Mask R-CNN Distributed training on Azure Machine Learning\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n",
      "SDK version: 1.1.5\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "\n",
    "import os\n",
    "from azureml.core import (Workspace, Experiment, \n",
    "                          VERSION, Datastore)\n",
    "from azureml.core.compute import ComputeTarget, AmlCompute\n",
    "from azureml.train.estimator import Estimator, Mpi\n",
    "from azureml.core.compute_target import ComputeTargetException\n",
    "from azureml.widgets import RunDetails\n",
    "\n",
    "\n",
    "SUBSCRIPTION_ID = \"\"\n",
    "RESOURCE_GROUP = \"\"\n",
    "WORKSPACE_NAME = \"\"\n",
    "EXP_NAME = 'Azureml-maskRCNN'\n",
    "CLUSTER_NAME = \"gpu-cluster\"\n",
    "NODE_COUNT = 8\n",
    "TRAINING_DIR = os.path.join(os.getcwd(),'horovod')\n",
    "\n",
    "\n",
    "\n",
    "print(\"SDK version:\", VERSION)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "ws = Workspace(subscription_id = SUBSCRIPTION_ID, \n",
    "               resource_group =RESOURCE_GROUP , \n",
    "               workspace_name = WORKSPACE_NAME\n",
    "              )\n",
    "\n",
    "\n",
    "exp = Experiment(workspace=ws, name=EXP_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found existing compute target.\n",
      "Checking cluster status...\n",
      "Succeeded\n",
      "AmlCompute wait for completion finished\n",
      "\n",
      "Minimum number of nodes requested have been provisioned\n"
     ]
    }
   ],
   "source": [
    "found = False\n",
    "cts = ws.compute_targets\n",
    "if CLUSTER_NAME in cts and cts[CLUSTER_NAME].type == 'AmlCompute':\n",
    "    found = True\n",
    "    print('Found existing compute target.')\n",
    "    compute_target = cts[CLUSTER_NAME]\n",
    "\n",
    "if not found:\n",
    "    print('Creating a new compute target...')\n",
    "    provisioning_config = AmlCompute.provisioning_configuration(vm_size =  'Standard_NC12',max_nodes = NODE_COUNT)\n",
    "\n",
    "    # Create the cluster.\\n\",\n",
    "    compute_target = ComputeTarget.create(ws, CLUSTER_NAME, provisioning_config)\n",
    "\n",
    "print('Checking cluster status...')\n",
    "compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overwriting run.py\n"
     ]
    }
   ],
   "source": [
    "%%writefile run.py\n",
    "\n",
    "import os\n",
    "import shutil,glob\n",
    "import argparse\n",
    "from git.repo.base import Repo\n",
    "\n",
    "\n",
    "SRC_DIR = '/maskrcnn'\n",
    "SAMPLES_DIR = os.path.join(SRC_DIR,'samples')\n",
    "LOGS_DIR = os.path.join(os.getcwd(),'logs')\n",
    "os.makedirs(LOGS_DIR, exist_ok = True)\n",
    "\n",
    "parser = argparse.ArgumentParser()\n",
    "parser.add_argument('--is_distributed', type=bool,help='Distributed training')\n",
    "\n",
    "args = parser.parse_args()\n",
    "is_distributed = args.is_distributed\n",
    "\n",
    "#=====Clone distributed training implementation of Mask_RCNN==========\n",
    "\n",
    "REPO_URL=\"https://github.com/datashinobi/Mask_RCNN.git\"\n",
    "BRANCH='yassine/horovod'\n",
    "\n",
    "if os.path.exists(SRC_DIR):\n",
    "    print(\"Repo exists, skip cloning\")\n",
    "else:\n",
    "    print('Clonerepo..........')\n",
    "    Repo.clone_from(REPO_URL,SRC_DIR, branch=BRANCH)\n",
    "\n",
    "#=====move training code to source dir=====\n",
    "shutil.copytree(os.path.join(os.getcwd(),'horovod'), os.path.join(SAMPLES_DIR,'horovod'))  \n",
    "\n",
    "os.chdir(os.path.join(SAMPLES_DIR,'horovod'))\n",
    "\n",
    "from train import run\n",
    "run(is_distributed,LOGS_DIR)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overwriting /extdrive1/home/sasuke/dev/amlsamples/maskRCNN_distributed/horovod/train.py\n"
     ]
    }
   ],
   "source": [
    "%%writefile $TRAINING_DIR/train.py\n",
    "\n",
    "import os\n",
    "ROOT_DIR = os.path.abspath(\"../../\")\n",
    "\n",
    "def run(is_distributed,logs_dir):\n",
    "    from dataset import ShapesDataset\n",
    "    from mrcnn.config import Config\n",
    "    \n",
    "    ######################\n",
    "    class ShapesConfig(Config):\n",
    "        NAME = \"shapes\"\n",
    "        GPU_COUNT = 2\n",
    "        IMAGES_PER_GPU =2 \n",
    "        NUM_CLASSES = 1 + 3\n",
    "        IMAGE_MIN_DIM = 128\n",
    "        IMAGE_MAX_DIM = 128\n",
    "        RPN_ANCHOR_SCALES = (8, 16, 32, 64, 128)\n",
    "        TRAIN_ROIS_PER_IMAGE = 32\n",
    "        STEPS_PER_EPOCH = 10\n",
    "        VALIDATION_STEPS = 5\n",
    "\n",
    "    config = ShapesConfig()\n",
    "    config.display()\n",
    "    \n",
    "    # Training dataset\n",
    "    dataset_train = ShapesDataset()\n",
    "    dataset_train.load_shapes(500000, config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1])\n",
    "    dataset_train.prepare()\n",
    "\n",
    "    # Validation dataset\n",
    "    dataset_val = ShapesDataset()\n",
    "    dataset_val.load_shapes(5000, config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1])\n",
    "    dataset_val.prepare()\n",
    "    if is_distributed:\n",
    "        import mrcnn.distributed_model as modellib\n",
    "    else:\n",
    "        import mrcnn.model as modellib\n",
    "        \n",
    "    from mrcnn import utils\n",
    "\n",
    "    # Local path to trained weights file\n",
    "    COCO_MODEL_PATH = os.path.join(ROOT_DIR, \"mask_rcnn_coco.h5\")\n",
    "\n",
    "    # Download COCO trained weights from Releases if needed\n",
    "    if not os.path.exists(COCO_MODEL_PATH):\n",
    "        utils.download_trained_weights(COCO_MODEL_PATH)\n",
    "\n",
    "\n",
    "    # number of found devices by TF\n",
    "    from tensorflow.python.client import device_lib\n",
    "    device_lib.list_local_devices()\n",
    "    \n",
    "    # Create model in training mode\n",
    "    model = modellib.MaskRCNN(\"training\", config, logs_dir)\n",
    "\n",
    "    # Load weights trained on MS COCO, but skip layers that\n",
    "        # are different due to the different number of classes\n",
    "        # See README  @ https://github.com/matterport/Mask_RCNNfor instructions to download the COCO weights\n",
    "    model.load_weights(COCO_MODEL_PATH, by_name=True,\n",
    "                       exclude=[\"mrcnn_class_logits\", \"mrcnn_bbox_fc\", \n",
    "                                \"mrcnn_bbox\", \"mrcnn_mask\"])\n",
    "\n",
    "\n",
    "    \n",
    "    model.train(dataset_train, dataset_val, \n",
    "                learning_rate=config.LEARNING_RATE, \n",
    "                epochs=1000, \n",
    "                layers='heads')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "script_params = {'--is_distributed':True}\n",
    "\n",
    "from azureml.train.dnn import TensorFlow \n",
    "estimator = TensorFlow(source_directory=os.getcwd(),\n",
    "                       compute_target=compute_target,\n",
    "                       entry_script='run.py',\n",
    "                       pip_requirements_file=\"requirements.txt\",\n",
    "                       node_count=NODE_COUNT,\n",
    "                       distributed_training=Mpi(),\n",
    "                       use_gpu=True,\n",
    "                       script_params=script_params,\n",
    "                       framework_version='1.10'\n",
    "                      )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table style=\"width:100%\"><tr><th>Experiment</th><th>Id</th><th>Type</th><th>Status</th><th>Details Page</th><th>Docs Page</th></tr><tr><td>Azureml-maskRCNN</td><td>Azureml-maskRCNN_1585167843_7b6e658f</td><td>azureml.scriptrun</td><td>Starting</td><td><a href=\"https://ml.azure.com/experiments/Azureml-maskRCNN/runs/Azureml-maskRCNN_1585167843_7b6e658f?wsid=/subscriptions/fe375bc2-9f1a-4909-ad0d-9319806d5e97/resourcegroups/adb_rg/workspaces/repro\" target=\"_blank\" rel=\"noopener\">Link to Azure Machine Learning studio</a></td><td><a href=\"https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.script_run.ScriptRun?view=azure-ml-py\" target=\"_blank\" rel=\"noopener\">Link to Documentation</a></td></tr></table>"
      ],
      "text/plain": [
       "Run(Experiment: Azureml-maskRCNN,\n",
       "Id: Azureml-maskRCNN_1585167843_7b6e658f,\n",
       "Type: azureml.scriptrun,\n",
       "Status: Starting)"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "run = exp.submit(estimator)\n",
    "run"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Start tensorboad to track run metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from azureml.tensorboard import Tensorboard\n",
    "\n",
    "tb = Tensorboard([run])\n",
    "tb.start()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tb.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:amlsamples_env]",
   "language": "python",
   "name": "conda-env-amlsamples_env-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
